/*
  author Sylvain Bertrand <sylvain.bertrand@gmail.com>
  Protected by linux GNU GPLv2
  Copyright 2012-2014
*/
#include <linux/pci.h>
#include <linux/cdev.h>
#include <linux/delay.h>

#include <alga/rng_mng.h>
#include <uapi/alga/pixel_fmts.h>
#include <alga/timing.h>
#include <alga/amd/atombios/atb.h>
#include <uapi/alga/amd/dce6/dce6.h>

#include "mc.h"
#include "rlc.h"
#include "ih.h"
#include "fence.h"
#include "ring.h"
#include "dmas.h"
#include "ba.h"
#include "cps.h"
#include "gpu.h"
#include "drv.h"

#include "tiling.h"

#include "regs.h"

/*
 * Record the per-family shader engine topology and scan converter FIFO
 * sizes in the driver data. Values shared by every supported family are
 * factored out below the switch; an unknown family leaves cfg untouched.
 */
void gpu_cfg_init(struct pci_dev *dev)
{
	struct dev_drv_data *dd;

	dd = pci_get_drvdata(dev);

	switch (dd->family) {
	case TAHITI:
		dd->cfg.gpu.ses_n = 2;
		dd->cfg.gpu.se_shs_n = 2;
		dd->cfg.gpu.se_rbs_n = 4;
		dd->cfg.gpu.sh_cus_n = 8;
		dd->cfg.gpu.sc_prim_fifo_sz_backend = 0x100;
		break;
	case PITCAIRN:
		dd->cfg.gpu.ses_n = 2;
		dd->cfg.gpu.se_shs_n = 2;
		dd->cfg.gpu.se_rbs_n = 4;
		dd->cfg.gpu.sh_cus_n = 5;
		dd->cfg.gpu.sc_prim_fifo_sz_backend = 0x100;
		break;
	case VERDE:
		dd->cfg.gpu.ses_n = 1;
		dd->cfg.gpu.se_shs_n = 2;
		dd->cfg.gpu.se_rbs_n = 4;
		dd->cfg.gpu.sh_cus_n = 5;
		dd->cfg.gpu.sc_prim_fifo_sz_backend = 0x40;
		break;
	case OLAND:
		dd->cfg.gpu.ses_n = 1;
		dd->cfg.gpu.se_shs_n = 1;
		dd->cfg.gpu.se_rbs_n = 2;
		dd->cfg.gpu.sh_cus_n = 6;
		dd->cfg.gpu.sc_prim_fifo_sz_backend = 0x40;
		break;
	default:
		/* unsupported family: do not touch the cfg */
		return;
	}

	/* identical across all supported families */
	dd->cfg.gpu.hw_ctxs_n = 8;
	dd->cfg.gpu.sc_prim_fifo_sz_frontend = 0x20;
	dd->cfg.gpu.sc_hiz_tile_fifo_sz = 0x30;
	dd->cfg.gpu.sc_earlyz_tile_fifo_sz = 0x130;
}

/*
 * Pulse a soft reset of all gfx blocks through GRBM_SOFT_RESET: set the
 * reset bits, wait, then clear them again. Each write is followed by a
 * read of the same register to post it before the delay.
 */
void gpu_soft_reset(struct pci_dev *dev)
{
	u32 ctl;
	u32 reset_mask;

	reset_mask = GSR_SOFT_RESET_CP
			| GSR_SOFT_RESET_RLC
			| GSR_SOFT_RESET_CB
			| GSR_SOFT_RESET_DB
			| GSR_SOFT_RESET_GDS
			| GSR_SOFT_RESET_PA
			| GSR_SOFT_RESET_SC
			| GSR_SOFT_RESET_BCI
			| GSR_SOFT_RESET_SPI
			| GSR_SOFT_RESET_SX
			| GSR_SOFT_RESET_TC
			| GSR_SOFT_RESET_TA
			| GSR_SOFT_RESET_VGT
			| GSR_SOFT_RESET_IA;

	/* assert the resets */
	ctl = rr32(dev, GRBM_SOFT_RESET);
	wr32(dev, ctl | reset_mask, GRBM_SOFT_RESET);
	ctl = rr32(dev, GRBM_SOFT_RESET);	/* posting read */
	udelay(50);

	/* deassert the resets */
	wr32(dev, ctl & ~reset_mask, GRBM_SOFT_RESET);
	rr32(dev, GRBM_SOFT_RESET);		/* posting read */
	udelay(50);
}

/*
 * this is the actual GPU silicon block, the one which does 3d and compute,
 * not the whole chip
 */

/*
 * Program GRBM_GFX_IDX so that following register accesses target the
 * given shader engine (se_idx) / shader array (sh_idx) pair. An index of
 * 0xffffffff means "broadcast to all" at that level. Instance-level
 * accesses are always broadcast.
 */
static void se_sh_select(struct pci_dev *dev, u32 se_idx, u32 sh_idx)
{
	u32 grbm_gfx_idx = GGI_INST_BROADCAST_WRS;

	if ((se_idx == 0xffffffff) && (sh_idx == 0xffffffff)) {
		/*
		 * Fix: this branch used "=", silently dropping
		 * GGI_INST_BROADCAST_WRS in the broadcast-to-all case;
		 * instance broadcast must stay set here as in every
		 * other branch.
		 */
		grbm_gfx_idx |= GGI_SH_BROADCAST_WRS | GGI_SE_BROADCAST_WRS;
	} else if (se_idx == 0xffffffff) {
		grbm_gfx_idx |= GGI_SE_BROADCAST_WRS | set(GGI_SH_IDX, sh_idx);
	} else if (sh_idx == 0xffffffff) {
		grbm_gfx_idx |= GGI_SH_BROADCAST_WRS | set(GGI_SE_IDX, se_idx);
	} else {
		grbm_gfx_idx |= set(GGI_SH_IDX, sh_idx)
						| set(GGI_SE_IDX, se_idx);
	}
	wr32(dev, grbm_gfx_idx, GRBM_GFX_IDX);
}

/*
 * bitwidth=0-->mask=0x0
 * bitwidth=1-->mask=0x1
 * bitwidth=2-->mask=0x3
 * ...
 */
/*
 * bitwidth=0-->mask=0x0
 * bitwidth=1-->mask=0x1
 * bitwidth=2-->mask=0x3
 * ...
 * O(1) closed form instead of the previous O(bit_width) loop. Widths of
 * 32 or more saturate to an all-ones mask (matches the old loop, and
 * avoids the undefined behavior of shifting a u32 by 32).
 */
static u32 bitmask_create(u32 bit_width)
{
	if (bit_width >= 32)
		return 0xffffffff;
	return (1U << bit_width) - 1;
}

/* the mask of disabled rbs of the **selected sh** */
static u32 sh_rbs_dis_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 rbs_dis;
	u32 mask;
	u32 backend_dis_shift;

	dd = pci_get_drvdata(dev);

	/* disabling a rb for a sh is done at 2 levels: CC_ and GC_USER_ */
	rbs_dis = rr32(dev, CC_RB_BACKEND_DIS);
	if (rbs_dis & CRBD_BACKEND_DIS_VALID)
		rbs_dis &= CRBD_BACKEND_DIS;
	else
		rbs_dis = 0;
	rbs_dis |= rr32(dev, GC_USER_RB_BACKEND_DIS);

	/* don't use get(), since we may have major bits on top */
	backend_dis_shift = ffs(CRBD_BACKEND_DIS);
	rbs_dis >>= backend_dis_shift;

	/* get a bit mask for the rbs which are disabled for this sh */
	mask = bitmask_create(dd->cfg.gpu.se_rbs_n / dd->cfg.gpu.se_shs_n);
	return rbs_dis & mask;
}

/* get enabled cus for the **selected sh** */
static u32 cus_ena_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 cus_dis;
	u32 mask;
	u32 inactive_cus_shift;

	dd = pci_get_drvdata(dev);

	/* disabling a cu for a sh is done at 2 levels: CC_ and GC_USER_ */
	cus_dis = rr32(dev, CC_GC_SHADER_ARRAY_CFG);
	if (cus_dis & CGSAC_INACTIVE_CUS_VALID)
		cus_dis &= CGSAC_INACTIVE_CUS;
	else
		cus_dis = 0;
	cus_dis |= rr32(dev, GC_USER_SHADER_ARRAY_CFG);

	inactive_cus_shift = ffs(CGSAC_INACTIVE_CUS);
	cus_dis >>= inactive_cus_shift;

	mask = bitmask_create(dd->cfg.gpu.sh_cus_n);
	return ~cus_dis & mask;
}

/*
 * For each se/sh, clear in SPI_STATIC_THD_MGMT_2 the bit of the first
 * enabled cu, then restore broadcast selection and program the SPI vtx
 * done delay.
 */
static void spi_setup(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 se;
	u32 sh;
	u32 cu;
	u32 spi_static_thd_mgmt_2;
	u32 cus_ena;
	u32 mask;

	dd = pci_get_drvdata(dev);

	for (se = 0; se < dd->cfg.gpu.ses_n; ++se) {
		for (sh = 0; sh < dd->cfg.gpu.se_shs_n; ++sh) {
			se_sh_select(dev, se, sh);
			spi_static_thd_mgmt_2 = rr32(dev,
							SPI_STATIC_THD_MGMT_2);
			cus_ena = cus_ena_get(dev);

			/*
			 * Fix: the previous "mask <<= cu" accumulated the
			 * shift each iteration and only probed cu bits
			 * 0, 1, 3, 6, 10, ...; probe every cu bit in order
			 * and stop at the first enabled one.
			 */
			for (cu = 0; cu < CGSAC_SH_CUS_N_MAX; ++cu) {
				mask = 1U << cu;
				if (cus_ena & mask) {
					spi_static_thd_mgmt_2 &= ~mask;
					wr32(dev, spi_static_thd_mgmt_2,
							SPI_STATIC_THD_MGMT_2);
					break;
				}
			}
		}
	}
	se_sh_select(dev, 0xffffffff, 0xffffffff);

	wr32(dev, set(SCC_VTX_DONE_DELAY, SCC_DELAY_22_CLKS), SPI_CFG_CTL_1);
}

/*
 * Collect the per-sh disabled-rb masks into one chip-wide mask, invert it
 * into an enabled-rb mask, then derive and program PA_SC_RASTER_CFG for
 * each se (2 rb-enable bits are consumed per sh).
 */
static void rbs_setup(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 se;
	u32 sh;
	u32 sh_rbs_dis;
	u32 rbs_dis;
	u32 mask;
	u32 rbs_ena;
	u32 pa_sc_raster_cfg;

	dd = pci_get_drvdata(dev);

	/* build the mask of dis rbs from regs */
	rbs_dis = 0;
	for (se = 0; se < dd->cfg.gpu.ses_n; ++se) {
		for (sh = 0; sh <  dd->cfg.gpu.se_shs_n; ++sh) {
			se_sh_select(dev, se, sh);
			sh_rbs_dis = sh_rbs_dis_get(dev);
			/* each sh owns a fixed-width slice of the mask */
			rbs_dis |= sh_rbs_dis << ((se * dd->cfg.gpu.se_shs_n
					+ sh) * CRBD_TAHITI_RB_BITMAP_W_PER_SH);
		}
	}

	se_sh_select(dev, 0xffffffff, 0xffffffff);

	/* build the mask of ena rbs from the mask of dis rbs */
	rbs_ena = 0;
	mask = 1;
	for (se = 0; se < dd->cfg.gpu.se_rbs_n * dd->cfg.gpu.ses_n; ++se) {
		if(!(rbs_dis & mask))
			rbs_ena |= mask;
		mask <<= 1;
	}

	/* configure the raster blk for each sh */
	for (se = 0; se < dd->cfg.gpu.ses_n; ++se) {
		pa_sc_raster_cfg = 0;
		for (sh = 0; sh < dd->cfg.gpu.se_shs_n; ++sh) {
			/*
			 * 2 bits per sh in rbs_ena: pick the map matching
			 * which of the sh's 2 rbs are usable.
			 * NOTE(review): case 2 -> MAP_3 and case 3 -> MAP_2
			 * looks odd but mirrors the upstream radeon
			 * si_setup_rb() pairing -- confirm before changing.
			 */
			switch (rbs_ena & 3) {
			case 1:
				pa_sc_raster_cfg |= (PSRC_RB_MAP_0
						<< (se * dd->cfg.gpu.se_shs_n
								+ sh) * 2);
				break;
			case 2:
				pa_sc_raster_cfg |= (PSRC_RB_MAP_3
						<< (se * dd->cfg.gpu.se_shs_n
								+ sh) * 2);
				break;
			case 3:
			default:
				pa_sc_raster_cfg |= (PSRC_RB_MAP_2
				<< (se * dd->cfg.gpu.se_shs_n + sh) * 2);
				break;
			}
			rbs_ena >>= 2;/* ena rbs of next sh */
		}
		/* broadcast the value to all shs of this se */
		se_sh_select(dev, se, 0xffffffff);
		dev_info(&dev->dev, "se=%u pa_sc_raster_cfg=0x%08x\n", se,
							pa_sc_raster_cfg);
		wr32(dev, pa_sc_raster_cfg, PA_SC_RASTER_CFG);
	}
	se_sh_select(dev, 0xffffffff, 0xffffffff);
}

/*
 * gpu register defaults which need to be inited only once; most of them
 * are in the cfg space (PA_SC_RASTER_CFG is in the ctx space though)
 */
void gpu_defaults(struct pci_dev *dev, u32 addr_cfg, u32 mem_row_sz_kb)
{
	struct dev_drv_data *dd;
	u32 sx_debug_1;

	dd = pci_get_drvdata(dev);

	/* generous grbm read timeout, then the caller-provided addr cfg */
	wr32(dev, set(GC_RD_TIMEOUT, 0xff), GRBM_CTL);
	wr32(dev, addr_cfg, GB_ADDR_CFG);

	tiling_modes_tbl_init(dev, mem_row_sz_kb);
	rbs_setup(dev);
	spi_setup(dev);

	/*
	 * NOTE(review): read-modify-write with no modification; presumably
	 * a placeholder where debug bits used to be (or may be) tweaked --
	 * confirm whether the write-back is still needed.
	 */
	sx_debug_1 = rr32(dev, SX_DEBUG_1);
	wr32(dev, sx_debug_1, SX_DEBUG_1);

	/* cfg space */
	/* scan converter fifo sizes from the per-family cfg */
	wr32(dev, set(PSFS_SC_FRONTEND_PRIM_FIFO_SZ,
					dd->cfg.gpu.sc_prim_fifo_sz_frontend)
		| set(PSFS_SC_BACKEND_PRIM_FIFO_SZ,
					dd->cfg.gpu.sc_prim_fifo_sz_backend)
		| set(PSFS_SC_HIZ_TILE_FIFO_SZ, dd->cfg.gpu.sc_hiz_tile_fifo_sz)
		| set(PSFS_SC_EARLYZ_TILE_FIFO_SZ,
					dd->cfg.gpu.sc_earlyz_tile_fifo_sz),
								PA_SC_FIFO_SZ);
	wr32(dev, 1, VGT_INSTS_N);

	wr32(dev, 0, SQ_CFG);

	wr32(dev, set(PSFEMC_FORCE_EOV_MAX_CLK_CNT, 4095)
				| set(PSFEMC_FORCE_EOV_MAX_REZ_CNT, 255),
						PA_SC_FORCE_EOV_MAX_CNTS);


	wr32(dev, set(VCI_CACHE_INVALIDATION, VCI_VC_AND_TC)
				| set(VCI_AUTO_INVLD_ENA, VCI_ES_AND_GS_AUTO),
							VGT_CACHE_INVALIDATION);

	wr32(dev, 16, VGT_GS_VTX_REUSE);
	wr32(dev, 0, PA_SC_LINE_STIPPLE_STATE);

	/* zero all cb performance counter selectors */
	wr32(dev, 0, CB_PERF_CTR_0_SEL_0);
	wr32(dev, 0, CB_PERF_CTR_0_SEL_1);
	wr32(dev, 0, CB_PERF_CTR_1_SEL_0);
	wr32(dev, 0, CB_PERF_CTR_1_SEL_1);
	wr32(dev, 0, CB_PERF_CTR_2_SEL_0);
	wr32(dev, 0, CB_PERF_CTR_2_SEL_1);
	wr32(dev, 0, CB_PERF_CTR_3_SEL_0);
	wr32(dev, 0, CB_PERF_CTR_3_SEL_1);

	wr32(dev, PCE_CLIP_VTX_REORDER_ENA | set(PCE_CLIP_SEQ_N, 3),
								PA_CL_ENHANCE);

	/* let the programmed state settle */
	udelay(50);
}

/*
 * Set both override bits in CGTS_SM_CTL (presumably defeating the
 * clock-gating the _ena counterpart programs). The write is skipped when
 * the bits are already set, sparing an mmio write.
 */
void gpu_mgcg_dis(struct pci_dev *dev)
{
	u32 val;

	val = rr32(dev, CGTS_SM_CTL);
	if ((val & (CSC_LS_OVERRIDE | CSC_OVERRIDE))
				!= (CSC_LS_OVERRIDE | CSC_OVERRIDE))
		wr32(dev, val | CSC_LS_OVERRIDE | CSC_OVERRIDE, CGTS_SM_CTL);
}

/*
 * Load CGTS_SM_CTL with its magic enable value, skipping the mmio write
 * when the register already holds it.
 */
void gpu_mgcg_ena(struct pci_dev *dev)
{
	static const u32 ena_val = 0x96940200;
	u32 val;

	val = rr32(dev, CGTS_SM_CTL);
	if (val != ena_val)
		wr32(dev, ena_val, CGTS_SM_CTL);
}

void gpu_lb_pw_dis(struct pci_dev *dev)
{
	se_sh_select(dev, 0xffffffff, 0xffffffff);
	wr32(dev, 0x00ff, SPI_LB_CU_MASK);
}

/* clear GRBM_INT_CTL, masking all grbm interrupt sources */
void gpu_grdbm_int_reset(struct pci_dev *dev)
{
	wr32(dev, 0, GRBM_INT_CTL);
}
